library(rmapshaper)
library(censusapi)
library(shapefiles)
library(base)
library(tidyverse)
library(tigris)
library(sf)
library(mapview)
library(raster)
library(rlang)
library(rgeos)
library(data.table)
library(measurements)
library(smoothr)
library(lwgeom)
library(units)
library(geosphere)
library(kableExtra)
library(leaflet)
library(htmltools)
library(plotly)
library(osrm)


mapviewOptions(
  basemaps = "OpenStreetMap"
)
options(
  tigris_class = "sf",
  tigris_use_cache = TRUE,
  osrm.server = "http://127.0.0.1:5000/",
  osrm.profile = "walk"
)

Sys.setenv(CENSUS_KEY="b88495f8315f45b2d100dee9ba8f4c489a7371c2")

projection <- "+proj=utm +zone=10 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=ft +no_defs"

`%notin%` <- Negate(`%in%`)

Introduction

Does the location of where one lives impact the ability to enjoy outdoor green space? Does the size of the yard at your house play a factor in whether you regularly visit parks? Are communities under-visiting parks that are physically closer to them? The goal of this analysis is to provide insight on park accessibility within communities in the City of San Jose. We will consider factors such as backyard size, park visit frequency, and distance from a park to determine a community’s park accessibility.

The first few sections describe our methodological approaches for preparing the factors for analysis, followed by sections detailing the comparisons between factors and the resulting conclusions.

Safegraph Data Collection Explanation

The following describes how the Safegraph data is processed:

Safegraph collects device visit information to “places of interest” (POI) like parks or grocery stores and includes some information about a device’s origin census block group. The visit counts in the Safegraph dataset fall into either one of these three cases:

Note: We obtained all of the data for the park visits portion of the analysis from Safegraph Patterns Data.

Methodology: Collecting San Jose Residential Yard Sizes

Our goal for this section is to obtain the average residential yard size for each census block group in San Jose. We start with parcel data from the Santa Clara County Assessor’s Office, and filter it down to only include parcels with residential land use codes. The following list details the land use codes we used for this analysis:

After filtering to parcels with these residential use codes, we spatially match the building footprint shape associated with that parcel. To get from parcel area to yard area, we subtract the building footprint shape out of the parcel shape.

#load("P:/SFBI/Data Library/OSM/osm_bldg.Rdata")
# 
#osm_bldg <-
#  osm_bldg %>% st_transform(projection)

sj_boundary <-
  places("CA", cb=FALSE) %>%
  filter(NAME == "San Jose") %>%
  st_transform(projection)
# 
sj_blockgroup <-
  readRDS("sj_blockgroups.rds") %>%
  st_transform(projection)

#mapview(sj_blockgroup)


#load("P:/SFBI/Data Library/California Blocks/ca_blocks_lite.Rdata")
#ca_blocks_lite <- ca_blocks_lite %>% st_transform(projection)
#sj_blocks <- ca_blocks_lite[sj_boundary,]
#save(sj_blocks, file = "sj_blocks.Rdata")
#load("sj_blocks.Rdata")

#mapview(sj_blocks[100,])

# sj_bldg <-
#   osm_bldg[sj_boundary,] %>% 
#   transmute(
#     index = row_number()
#   )

#save(sj_bldg, file = "sj_bldg.Rdata")
load("sj_bldg.Rdata")

# scc_parcels <- st_read("P:/SFBI/Data Library/Parcels/santa clara/scc.shp")
# 
# scc_parcels <- 
#   scc_parcels %>%
#   st_transform(projection)
# 
# This includes parcels parcels at the periphery of San Jose. This is important for the "block method" used to find street edges. otherwise this sf object isn't used.
# sj_parcels_plus <-
#   scc_parcels[sj_blocks,] %>%
#   dplyr::select(APN, USE_CODE, SITUS_CITY)
# # 
# sj_parcels <-
#   sj_parcels_plus %>%
#   filter(SITUS_CITY == "SAN JOSE")
# #
# save(sj_parcels_plus, sj_parcels, file = "sj_parcels.Rdata")

load("sj_parcels.Rdata")


# apn_bg_lookup <- data.frame(APN = NA, origin_census_block_group = NA)
# 
# for(i in 1:nrow(sj_blockgroup)){
#   if(i %% 10 == 0){print(i)}
#     temp <- sj_blockgroup[i,]
#     temp2 <- sj_parcels[temp,] %>%
#       dplyr::select(APN) %>%
#       st_set_geometry(NULL) %>%
#       mutate(
#         origin_census_block_group = temp$origin_census_block_group
#       )
#     apn_bg_lookup <-
#       apn_bg_lookup %>%
#       rbind(temp2)
# }
# 
# apn_bg_lookup <-
#   apn_bg_lookup %>%
#   na.omit()
# 
# saveRDS(apn_bg_lookup,"sj_apn_bg_lookup.rds")

apn_bg_lookup <- readRDS("sj_apn_bg_lookup.rds")



sj_parcels_residential <-
  sj_parcels %>%
  mutate(
    USE_CODE = as.numeric(USE_CODE)
  ) %>%
  filter(USE_CODE %in% c(1,2,3,4,6,7))


#mapview(head(sj_parcels_residential,100))

matched_bldg_footprint <-
  sj_bldg %>%
  st_centroid() %>%
  st_intersection(sj_parcels_residential) %>%
  st_set_geometry(NULL) %>%
  left_join(sj_bldg, by = "index") %>%
  st_as_sf()
# 
# 
residential_parcels <-
  sj_parcels_residential %>%
  mutate(
     parcel_area = st_area(.) %>% as.numeric()
  ) %>%
  as.data.frame() %>%
  st_as_sf() %>%
  rename(parcel_geometry = geometry)
# 
# 
residential_bldg <-
  residential_parcels %>%
  as.data.frame() %>%
  right_join(
    matched_bldg_footprint %>% dplyr::select(APN),
    by = "APN"
  ) %>%
  filter(!is.na(parcel_area)) %>%
  rename(building_geometry = geometry) %>%
  st_as_sf() %>%
  st_set_geometry("building_geometry")
# 
# 
residential_parcels <-
  residential_bldg %>%
  st_set_geometry("parcel_geometry")
# 
residential_parcels <- residential_parcels[!duplicated(residential_parcels), ]

##############################################################
# available_yard <-
#   st_difference(
#     residential_parcels[1:10000,],
#     st_union(
#       residential_parcels$building_geometry[1:10000,])
#  )
# 
# available_yard2 <-
#   st_difference(
#     residential_parcels[10001:20000,],
#     st_union(
#       residential_parcels$building_geometry[10001:20000,])
#  )
# 
# available_yard3 <-
#   st_difference(
#     residential_parcels[20001:30000,],
#     st_union(
#       residential_parcels$building_geometry[20001:30000,])
#  )
# #
# available_yard4 <-
#   st_difference(
#     residential_parcels[30001:40000,],
#     st_union(
#       residential_parcels$building_geometry[30001:40000,])
#  )
# 
# available_yard5 <-
#   st_difference(
#     residential_parcels[40001:50000,],
#     st_union(
#       residential_parcels$building_geometry[40001:50000,])
#  )
# 
# available_yard6 <-
#   st_difference(
#     residential_parcels[50001:60000,],
#     st_union(
#       residential_parcels$building_geometry[50001:60000,])
#  )
# 
# available_yard7 <-
#   st_difference(
#     residential_parcels[60001:70000,],
#     st_union(
#       residential_parcels$building_geometry[60001:70000,])
#  )
# 
# available_yard8 <-
#   st_difference(
#     residential_parcels[70001:80000,],
#     st_union(
#       residential_parcels$building_geometry[70001:80000,])
#  )
# 
# available_yard9 <-
#   st_difference(
#     residential_parcels[80001:90000,],
#     st_union(
#       residential_parcels$building_geometry[80001:90000,])
#  )
# 
# available_yard10 <-
#   st_difference(
#     residential_parcels[90001:100000,],
#     st_union(
#       residential_parcels$building_geometry[90001:100000,])
#  )
# 
# available_yard10 <-
#   st_difference(
#     residential_parcels[100001:nrow(residential_parcels),],
#     st_union(
#       residential_parcels$building_geometry[100001:nrow(residential_parcels),])
#  )


# all_available_yard <-
#   available_yard %>%
#   rbind(available_yard2) %>%
#   rbind(available_yard3) %>%
#   rbind(available_yard4) %>%
#   rbind(available_yard5) %>%
#   rbind(available_yard6) %>%
#   rbind(available_yard7) %>%
#   rbind(available_yard8) %>%
#   rbind(available_yard9) %>%
#   rbind(available_yard10)



#save(all_available_yard, file = "sj_all_yard_available.Rdata")

load("sj_all_yard_available.Rdata")


all_available_yard_other <-
  all_available_yard %>%
  filter(USE_CODE == 2 | USE_CODE == 3)

all_available_yard_apartment <-
  all_available_yard %>%
  filter(USE_CODE == 4) 
  
all_available_yard_other2 <-
  all_available_yard %>%
  filter(USE_CODE %in% c(6,7))

available_area_fun <- function(df, residential_parcels){
  df %>%
    as.data.frame() %>%
    rename(geometry = parcel_geometry) %>%
    left_join(residential_parcels %>% dplyr::select(APN), by = "APN") %>%
    st_as_sf() %>%
    st_set_geometry("geometry") %>%
    as_Spatial() %>%
    gBuffer(byid = T, width = 0) %>%
    sp::disaggregate() %>%
    st_as_sf() %>%
    rename(available_area_geometry = geometry) %>%
    st_set_geometry("available_area_geometry") %>%
    mutate(
      available_area = round(st_area(.) %>% as.numeric())
    ) %>%
    st_set_crs(projection)
}


# save(available_area, file = "sj_available_backyard_area.Rdata")

#!!!!This .Rdata file constians all of the yard areas for parcels zoned as single-family residential!!!!
load("sj_available_backyard_area.Rdata")
 

# PUC 2,3
available_area1_other <- available_area_fun(all_available_yard_other[1:nrow(all_available_yard_other),], residential_parcels)

available_area1_other <-
  available_area1_other %>%
  dplyr::select(APN, available_area, available_area_geometry) 


#PUC 6,7
available_area2_other <- available_area_fun(all_available_yard_other2, residential_parcels)

available_area2_other <-
  available_area2_other %>%
  dplyr::select(APN, available_area, available_area_geometry) 



available_area_apartment <-
  all_available_yard_apartment %>%
  mutate(
    bldg_area = st_area(building_geometry)
    ) %>%
  group_by(APN, parcel_area) %>%
  summarize(bldg_area_sum = sum(as.numeric(bldg_area))) %>%
  mutate(available_area = parcel_area - bldg_area_sum)

available_area_apartment <-
  available_area_apartment %>%
  rename(available_area_geometry = building_geometry) %>%
  dplyr::select(APN,available_area,available_area_geometry) %>%
  mutate(
    available_area = ifelse(available_area < 0,0,available_area)
  )

#combining all PUC dataframes
available_area_combo <-
  available_area %>%
  rbind(available_area1_other) %>%
  rbind(available_area2_other) %>%
  rbind(available_area_apartment)


nix_list <- c("69403028","41425018","67013004","67013003","67029024","74207011")


nix_df <-
  available_area %>%
  filter(APN %in% nix_list) %>%
  left_join(
    all_available_yard %>%
      st_set_geometry(NULL) %>%
      dplyr::select(APN, USE_CODE) %>%
      filter(APN %in% nix_list),
    by = "APN"
    )


available_area_bg <-
  available_area_combo %>%
  left_join(
    apn_bg_lookup %>%
      filter(APN %in% available_area$APN),
    by = "APN"
  ) %>%
  filter(APN %notin% nix_list)

#  
bg_area_avg <-
  available_area_bg %>%
  st_set_geometry(NULL) %>%
  group_by(origin_census_block_group) %>%
  summarise(yard_area_avg = sum(available_area))
# 
extra_bg <-
  sj_blockgroup %>%
  filter(origin_census_block_group %notin% bg_area_avg$origin_census_block_group) %>%
  st_set_geometry(NULL) %>%
  dplyr::select(origin_census_block_group) %>%
  mutate(yard_area_avg = 0)
# 
bg_area_avg_all <-
  bg_area_avg %>%
  rbind(extra_bg) %>%
  left_join(
    sj_blockgroup %>%
      dplyr::select(-DISTRICTS),
    by = "origin_census_block_group"
  ) %>%
  na.omit() %>%
  st_as_sf() %>%
  st_transform(4326)

  

#save(bg_area_avg_all, file = "sj_bg_avg_yard_area.Rdata")
load("sj_bg_avg_yard_area.Rdata")

sj_population <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*",
    regionin = "state:06+county:085",
    vars = c("group(B01003)")
  ) %>%
  mutate(
    origin_census_block_group =
      paste0(state,county,tract,block_group)
  ) %>%
  rename(
    total_pop = "B01003_001E"
  ) %>%
  dplyr::select(total_pop, origin_census_block_group)

sj_yard_pop_weight <-
  bg_area_avg_all %>%
  left_join(
    sj_population %>%
      filter(origin_census_block_group %in% bg_area_avg_all$origin_census_block_group),
    by = "origin_census_block_group"
  ) %>%
  mutate(
    yard_per_capita = yard_area_avg / total_pop
  )

The following map shows the void building footprint within the parcel shape for one block group. The resulting shape represents the yard area for each parcel.

example_bg_yard <-
  available_area_bg %>%
  filter(origin_census_block_group == "060855006002")

mapview(example_bg_yard)

In order to align our geographic granularity with the other factors of our analysis, we take the sum of all of the yard sizes within a block group. This value is then divided by the block group population to obtain the yard area per capita. The following map shows the yard area per capita sizes for all of the block groups in San Jose. Notice that the largest values occur near the left border of the city, near the hills.

bg_area_avg_minus <-
  sj_yard_pop_weight 

blue_pal2 <- colorNumeric(
  palette = colorRamp(c("#FFFFFF", "#00288c"), interpolate="spline"),
  domain =
    bg_area_avg_minus %>%
    pull(yard_per_capita) %>%
    unique()
)

yard_bg_map_minus <-
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(
    data = sj_boundary %>% st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = bg_area_avg_minus,
    fillColor = ~blue_pal2(yard_per_capita),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    #group = "Average Daily Park Visits",
    label = ~paste0("Block Group Average Yard Size per Capita: ",round(yard_per_capita), " square feet"),
    highlightOptions =
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )) %>%
  addLegend(
    position = "topright", 
    pal = blue_pal2, 
    values = bg_area_avg_minus$yard_per_capita,
    title = "Avg. Yard Area per Capita (sqft.)",
    opacity = 1
  )

yard_bg_map_minus

Note: The data from the Santa Clara County assessor’s office likely includes multiple zoning misclassifications. For example, the following parcel has a Property Use Code of 1 (single-family residential), but it is actually a water treatment plant. This “yard” (that has an area of ~one million square feet) was inflating the yard size for the block group in which it is located.

mapview(nix_df[7,])

We did a visual inspection of the single-family residential parcels in the block groups with large average yard sizes (over 30,000 sqft.), and removed the parcels that appeared to be misclassified. Keep in mind that there still may be other misclassified parcels that are being incorporated into the block group average yard size.

Some overall San Jose residential yard size statistics:

Mean San Jose Yard Area per Capita: 386 sq. ft.

Standard Deviation: 660 sq. ft.

Minimum San Jose Yard Size per Capita: 0 sq. ft.

Maximum San Jose Yard Size per Capita: 6300 sq. ft.

For now, I will put a hold on this analysis and move to the average number of people who visit San Jose parks that reside in these block groups. At the end, we will come back to these yard areas to compare against other accessibility factors.

Methodology: Obtaining CBG Visits to San Jose Parks

When looking at park visitors who live in specific census block groups (CBGs), it is not sufficient to consider the CBG visit numbers at face value. The population of each block group plays an important factor in weighing a community’s visits to parks. For example, if a CBG has 3 residents, and they have an average of 3 daily park visits, then we can infer that this community has very active park visitors (even though the 3 park visits may seem low).

The following map displays all of the census block groups we considered in this analysis (see the Safegraph Data Collection Explanation section above for why some CBGs are missing). The “Park Visits” option illustrates the average daily number of visits that each CBG had to a SJ park since January 1, 2020. The “Park Visits per 1000” displays a ratio, which provides more insight into how actively a community visits parks (a higher ratio = more active, a lower ratio = less active).

####periodically update from sanjose_park_process file
park_origins <- readRDS("sj_park_origins.rds") %>%
  filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group) %>%
  mutate(month = month(date %>% as.Date())) 

# park_origins_monthly <-
#   park_origins %>%
#   group_by(origin_census_block_group,month) %>%
#   summarise(month_total = sum(mean_origin_visits)) %>%
#   filter(month != 7)

park_geometry <-
  readRDS('park_output_geometry.rds') %>%
  st_as_sf() %>%
  st_transform(projection)
park_origin_mean <-
  park_origins %>%
  group_by(origin_census_block_group) %>%
  summarise(mean_visits = mean(mean_origin_visits))

visit_pop_combo <-
  sj_population %>%
  filter(origin_census_block_group %in% park_origin_mean$origin_census_block_group) %>%
  left_join(
    park_origin_mean,
    by = "origin_census_block_group"
  ) %>%
  na.omit() %>%
  mutate(
    visits_per_pop = round((mean_visits/total_pop)*1000)
  ) %>%
  left_join(
    sj_blockgroup %>%
      dplyr::select(-DISTRICTS),
    by = "origin_census_block_group"
  ) %>%
  st_as_sf() %>%
  st_transform(projection)

orange_pal <- colorNumeric(
  palette = colorRamp(c("#fcd786", "#a67100"), interpolate="spline"),
  domain =
    visit_pop_combo %>%
    pull(mean_visits) %>%
    unique()
)

teal_pal <- colorNumeric(
  palette = colorRamp(c("#d6fffe", "#015a68"), interpolate="spline"),
  domain =
    visit_pop_combo %>%
    pull(visits_per_pop) %>%
    unique()
)

all_visitors_map <-
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
 addPolygons(
    data = sj_boundary %>% st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = visit_pop_combo %>%
      st_transform(4326),
    fillColor = ~orange_pal(mean_visits),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "Average Daily San Jose Park Visits",
    label = ~paste0(round(mean_visits)," average daily park visits"),
    highlightOptions =
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )) %>%
  addPolygons(
    data = visit_pop_combo %>%
      st_transform(4326),
    fillColor = ~teal_pal(visits_per_pop),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "Average Park Visits per 1000",
    label = ~paste0(visits_per_pop," average daily park visits per 1000"),
    highlightOptions =
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addLayersControl(
    baseGroups = c("Average Daily San Jose Park Visits", "Average Park Visits per 1000"),
    options = layersControlOptions(collapsed = FALSE)
  )

all_visitors_map

Some interesting results to highlight:

The block group that has both the highest average daily park visits and highest visit/1000 people ratio is shown in the following view. Interestingly, this CBG contains the Santa Clara County jail and police office. Approximately 60% of the “residents” of this block group visit a San Jose park everyday. But who are these visitors? Inmates are not likely to have a cellular device. After a conversation with Stanford’s Reglab, we hypothesize that there are certain devices (tablets or phones) that police and correction officers use during the workday, but leave at the office during the night (which becomes the device’s home).

all_visitors_map %>%
  setView(-121.906408, 37.350420, zoom = 15)

An additional block group that stands out is one located near Emma Prusch Farm Park. This CBG contains mainly an on-ramp, a portion of Bayshore Freeway, and a storage center. In the “Average Daily Park Visits” view, this CBG does not stand out. But when you switch to the “Average Park Visits per 1000” view, the CBG shows to have the second highest visits/population ratio in all of San Jose (although the daily park visit average is only 8, 53% of those people visit parks everyday). This may be an instance of a homeless encampment underneath or near the freeway on-ramp, whose residents visit the nearby Emma Prusch Farm Park quite frequently.

homeless <-
  all_visitors_map %>%
  showGroup("Average Park Visits per 1000") %>%
  setView(-121.847157, 37.335857, zoom = 15)

homeless

Initial Findings: Communities with the Lowest and Highest Park Visits

What communities visit parks more or less frequently than others? We will now directly compare the CBGs with the lowest and highest park visits per 1000 ratios since January 1, 2020.

We considered a low visits per 1000 ratio to be less than 20, and a high ratio to be more than 110 (one SD less/more than the mean, respectively).

lowest_visitors <-
  visit_pop_combo %>%
  filter(visits_per_pop < mean(visit_pop_combo$visits_per_pop)-sd(visit_pop_combo$visits_per_pop)) %>%
  st_as_sf() %>%
  st_transform(projection)

purple_pal <- colorNumeric(
  palette = colorRamp(c("#b27ef2", "#4900a3"), interpolate="spline"),
  domain =
    lowest_visitors %>%
    pull(visits_per_pop) %>%
    unique()
)

highest_visitors <-
  visit_pop_combo %>%
  filter(visits_per_pop > mean(visit_pop_combo$visits_per_pop)+sd(visit_pop_combo$visits_per_pop)) %>%
  st_as_sf() %>%
  st_transform(projection)

red_pal <- colorNumeric(
  palette = colorRamp(c("#ff8585", "#9c0000"), interpolate="spline"),
  domain =
    highest_visitors %>%
    pull(visits_per_pop) %>%
    unique()
)

extremes_visitors_map <- 
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
 addPolygons(
    data = sj_boundary %>%
      st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = lowest_visitors %>%
      st_transform(4326),
    fillColor = ~purple_pal(visits_per_pop),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "CBGs with Lowest Park Visit Ratio",
    label = ~paste0(visits_per_pop," average daily park visits per 1000"),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addPolygons(
    data = highest_visitors %>%
      st_transform(4326),
    fillColor = ~red_pal(visits_per_pop),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "CBGs with Highest Park Visit Ratio",
    label = ~paste0(visits_per_pop," average daily park visits per 1000"),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addLayersControl(
    overlayGroups = c("CBGs with Lowest Park Visit Ratio", "CBGs with Highest Park Visit Ratio"),
    options = layersControlOptions(collapsed = FALSE)
  )

extremes_visitors_map
#mapview(lowest_visitors, zcol = "visits_per_pop")

In the geographic sense, there appears to be no specific CBG pattern that would allow for predictive classification of a low or high visit/population ratio.

Methodology: Distance from San Jose CBGs to a Park

Our main goal for this section is to find the distance and walking duration of the nearest park from each San Jose census block group. We first start by finding the nearest point on a park boundary to the centroid of each census block group. The following map provides a visual representation of this process. The red shape is the census block group, the green shape is a park, and the purple line represents the shortest distance from the center of the block group to the edge of the park.

sj_bg_coords <-
  sj_blockgroup %>%
  st_centroid()

temp_nearest <- st_nearest_points(sj_bg_coords$geometry[1],park_geometry$SHAPE[1])

mapview(temp_nearest) + mapview(sj_blockgroup[1,], col.regions = "red") + mapview(park_geometry[1,], col.regions = "green")

This distance is “as the crow flies”, or straight-line distance, which is not an accurate representation of the actual route one would take in a urban environment. Taking into account the complex nature of streets, we use OSRM routing to calculate walking distances and durations between census block groups and the nearest point on a park boundary. The following map illustrates the more refined route that results from OSRM:

# 
# nearest_points <- data.frame(origin_census_block_group = NA, geometry = NA, park_name = NA, nearest_point = NA)
# 
# for(i in 1:nrow(sj_bg_coords)){
#   for(j in 1:nrow(park_geometry)){
#     temp_df <- data.frame(
#       origin_census_block_group = sj_bg_coords$origin_census_block_group[i],
#       geometry = sj_bg_coords$geometry[i],
#       park_name = park_geometry$location_name[j]
#     ) 
#     
#     temp_df <-
#       temp_df %>%
#       mutate(
#         nearest_point = ifelse(length(st_intersection(st_nearest_points(sj_bg_coords$geometry[i],park_geometry$SHAPE[j]),park_geometry$SHAPE[j])) == 0, NA, st_intersection(st_nearest_points(sj_bg_coords$geometry[i],park_geometry$SHAPE[j]),park_geometry$SHAPE[j]))
#       )
# 
#     nearest_points <-
#       nearest_points %>%
#       rbind(temp_df)
#   }
# 
#   print(i)
# 
# }
# 
# nearest_points <-
#   nearest_points %>%
#   na.omit()


#save(nearest_points, file = "sj_park_bg_nearest_points.Rdata")
#load("sj_park_bg_nearest_points.Rdata")

# 
# 
# cbg_names <-
#   nearest_points %>%
#   dplyr::select(origin_census_block_group)
# 
# park_names <-
#   nearest_points %>%
#   dplyr::select(park_name)
# 
# nearest_points_cbg <-
#   nearest_points %>%
#   st_as_sf() %>%
#   st_set_crs(projection) %>%
#   st_transform(4326) %>%
#   rename(
#     block_group_centroid = geometry
#   ) %>%
#   dplyr::select(origin_census_block_group,block_group_centroid) %>%
#   st_coordinates() %>%
#   cbind(cbg_names) %>%
#   rename(
#     "latcbg" = Y,
#     "loncbg" = X
#   )
# 
# nearest_points_park <-
#   nearest_points %>%
#   st_set_geometry("nearest_point") %>%
#   st_as_sf() %>%
#   st_set_crs(projection) %>%
#   st_transform(4326) %>%
#   st_centroid() %>%
#   dplyr::select(park_name,nearest_point) %>%
#   st_coordinates() %>%
#   cbind(park_names) %>%
#   rename(
#     "latpark" = Y,
#     "lonpark" = X
#   )
#   
#   
# nearest_points_all <-
#   nearest_points_cbg %>%
#   cbind(nearest_points_park) %>%
#   na.omit()
# 
# 
# dist_park_osrm <-
#    1:nrow(nearest_points_all) %>%
#   map_dfr(function(i){
#     if(i%%50 == 0){print(i)}
#     osrmRoute(
#       src = nearest_points_all[i, c("origin_census_block_group", "loncbg","latcbg")],
#       dst = nearest_points_all[i, c("park_name", "lonpark","latpark")],
#       returnclass="sf"
#     ) %>%
#       as.data.frame()
#   }) %>% 
#    st_as_sf() %>% 
#    st_set_crs(4326)

#save(dist_park_osrm, file = "sj_osrm_park_dist.Rdata")
load("sj_osrm_park_dist.Rdata")

mapview(dist_park_osrm[1,]) + mapview(sj_blockgroup[1,], col.regions = "red") + mapview(park_geometry[1,], col.regions = "green")

The OSRM method assumes a fixed walking speed of 5 km/hr, or about 20 minutes per mile. We used this method to find the nearest park to each census block group. The following map displays all of the census block groups in San Jose, color coded by the distance to the nearest park.

min_dist_park_osrm <-
  dist_park_osrm %>%
  as.data.frame() %>%
  group_by(src) %>%
  summarize( 
    rank = which.min(distance),
    distance = distance[rank],
    duration = duration[rank],
    park_name = dst[rank]) %>%
  rename(
    origin_census_block_group = src
  ) %>%
  dplyr::select(-rank) %>%
  left_join(
    sj_blockgroup,
    by = "origin_census_block_group"
  ) %>%
  st_as_sf() %>%
  mutate(distance = distance / 1.60934)
  


green_pal <- colorNumeric(
  palette = colorRamp(c("#95f599", "#007804"), interpolate="spline"),
  domain =
    min_dist_park_osrm %>%
    pull(distance) %>%
    unique()
)

min_dist_map <- 
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
 addPolygons(
    data = sj_boundary %>%
      st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = min_dist_park_osrm %>%
      st_transform(4326),
    fillColor = ~green_pal(distance),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    #group = "CBGs with Lowest Park Visit Ratio",
    label = ~paste0("The closest park is ",park_name,", which is ",round(distance,2), " miles away."),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addLegend(
    position = "topright", 
    pal = green_pal, 
    values = min_dist_park_osrm$distance,
    title = "Distance to Nearest Park (mi)",
    opacity = 1
  )
  

min_dist_map
# park_centroid <-
#   park_geometry %>%
#   st_centroid()
# 
# park_isochrone <-
#   1:nrow(park_centroid) %>%
#   map_dfr(function(row){
#      print(row)
#     park_centroid[row,] %>%
#       st_transform(4326) %>%
#       osrmIsochrone(
#         returnclass="sf",
#         breaks = 10,
#         res = 20
#       ) %>%
#       as.data.frame()
#   }) %>%
#   st_as_sf() %>%
#   st_set_crs(4326)
# 
# save(park_isochrone, file = "sj_10min_park_isochrone.Rdata")

load("sj_10min_park_isochrone.Rdata")

According to Avi Yotam, who is the San Jose Parks Division Acting Deputy Director, a goal in parks and recreation departments across the nation is having a park within a 10-minute walk of all communities. By using OSRM Routing, we created isochrones that represent a 10 minute walk from each San Jose Park centroid (note: eventually, we will update this analysis to reflect isochrones emanating from the boundaries of the parks instead of the centroid. For large parks, such as Alum Rock Park, the 10 minute isochrone from the centroid does not result in an accurate representation of walking distance to communities because it remains within the park boundary). In the following map, the “10 Minute Walking Distance from SJ Park Centroids” option displays San Jose parks along with their 10 minute walking isochrones. The “CBGs within a 10 min. walk of a park” shows as the title describes.

walk_10min <-
  dist_park_osrm %>%
  as.data.frame() %>%
  st_as_sf() %>%
  st_drop_geometry() %>%
  filter(duration <= 10) %>%
  rename(
    origin_census_block_group = src,
    park_name = dst
  ) %>%
  group_by(origin_census_block_group) %>%
  summarise(duration = mean(duration)) %>%
  left_join(
    sj_blockgroup,
    by = "origin_census_block_group"
  ) %>%
  st_as_sf()
  
green_pal2 <- colorNumeric(
  palette = colorRamp(c("#95f599", "#007804"), interpolate="spline"),
  domain =
    walk_10min %>%
    pull(duration) %>%
    unique()
)

min_10duration_map <- 
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
 addPolygons(
    data = sj_boundary %>%
      st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = walk_10min %>%
      st_transform(4326),
    fillColor = ~green_pal2(duration),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "CBGs within a 10 min. walk of a park",
    label = ~paste0("Walking time to the nearest park: ",round(duration)," mintues"),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addPolygons(
    data = park_isochrone %>%
      st_transform(4326),
    fillColor = "#93c400",
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "10 Minute Walking Distance from SJ Park Centroids",
    label = "10 Minute Walk Isochrone",
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      ))%>%
  addPolygons(
    data = park_geometry %>%
      st_transform(4326),
    fillColor = "green",
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "10 Minute Walking Distance from SJ Park Centroids",
    label = ~paste0(location_name),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      ))%>%
  addLayersControl(
    baseGroups = c("10 Minute Walking Distance from SJ Park Centroids", "CBGs within a 10 min. walk of a park"),
    options = layersControlOptions(collapsed = FALSE)
  )

min_10duration_map

Distance Calculation Comparison: OSRM Routing vs. “As the Crow Flies”

How does the OSRM Routing distance calculation method compare to the “as the crow flies” straight-line method? To illustrate the difference, we will use the distances between San Jose CBGs and their nearest park (displayed in the maps in the previous section). In almost every case, the OSRM distance is larger than the straight line distance, so the the numbers in the map legend result from: OSRM - as the crow flies. Note that the delta is greater in the CBGs on the periphery of San Jose.

#Comparison: As the crow flies and OSRM
# matched_points <- data.frame(origin_census_block_group = NA, geometry = NA, park_name = NA, nearest_point = NA)
# 
# nearest_points <-
#   nearest_points %>%
#   as.data.frame()
# 
# for(i in 1:nrow(nearest_points)){
#   if(i%%500 == 0){print(i)}
#   for(j in 1:nrow(min_dist_park_osrm)){
#     if(nearest_points$origin_census_block_group[i] == min_dist_park_osrm$origin_census_block_group[j] & nearest_points$park_name[i] == min_dist_park_osrm$park_name[j]){
#       matched_points <-
#         matched_points %>%
#         rbind(nearest_points[i,])
#     }
#   }
#   
# }
# 
# 
# nearest_points_dist <-
#   matched_points %>%
#   na.omit() %>%
#   st_set_geometry("geometry") %>%
#   st_as_sf() %>%
#   st_set_crs(projection) %>%
#   st_transform(4326) %>%
#   st_set_geometry("nearest_point") %>%
#   st_as_sf() %>%
#   st_set_crs(projection) %>%
#   st_transform(4326) %>%
#   mutate(
#     crow_dist = 0
#   )
# 
# for(i in 1:nrow(nearest_points_dist)){
#   nearest_points_dist$crow_dist[i] = as.numeric(st_length(st_nearest_points(nearest_points_dist$geometry[i],nearest_points_dist$nearest_point[i]))) / 1609.34
# }
# 
# nearest_points_dist_mi <-
#   nearest_points_dist %>%
#   mutate(
#     crow_dist = as.numeric(crow_dist / 1609.34) #convert to miles
#   )
# 
# save(nearest_points_dist, file = "sj_parks_crow_dist.Rdata")

load("sj_parks_crow_dist.Rdata")

osrm_crow <-
  nearest_points_dist %>%
  left_join(
    min_dist_park_osrm %>%
      st_set_geometry(NULL) %>%
      dplyr::select(origin_census_block_group,distance),
    by = "origin_census_block_group"
  ) %>%
  mutate(dist_diff = distance - crow_dist) %>%
  st_set_geometry(NULL) %>%
  st_set_geometry("geometry") %>% 
  st_set_geometry(NULL) %>%
  left_join(
    sj_blockgroup,
    by = "origin_census_block_group"
  )


teal_pal2 <- colorNumeric(
  palette = colorRamp(c("#d6fffe", "#015a68"), interpolate="spline"),
  domain =
    osrm_crow %>%
    pull(dist_diff) %>%
    unique()
)

osrm_crow_map <- 
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
 addPolygons(
    data = sj_boundary %>%
      st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = osrm_crow %>%
      st_as_sf() %>%
      st_transform(4326),
    fillColor = ~teal_pal2(dist_diff),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    label = ~paste0("Delta between OSRM and 'as the crow flies': ",round(dist_diff,2)," miles"),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addLegend(
    position = "topright", 
    pal = teal_pal2, 
    values = osrm_crow$dist_diff,
    title = "Delta between OSRM and 'as the crow flies' distances",
    opacity = 1
  )

osrm_crow_map

To illustrate the difference further, the following scatter plot directly compares the distance values for each block group. The solid black line represents a 1:1 relationship, which the data would follow if the OSRM/as the crow flies values were identical. The points deviate away from the 1:1 line as the distances become greater, indicating that the OSRM distances are a more accurate representation of actual travel for parks that are further away from a CBG.

dist_scatter <-
  osrm_crow %>% 
  ggplot(
    aes(
      x = crow_dist,
      y = distance
    )
  ) +
  geom_abline(
  mapping = NULL,
  data = NULL,
  1,
  0,
  na.rm = FALSE,
  show.legend = TRUE
) +
  geom_point() +
  labs(
    x = "As the Crow Flies Distance (mi.)",
    y = "OSRM Routing Distance (mi.)",
    title = "OSRM Routing and `As the Crow Flies` Distance Comparisons (CBGs to Nearest SJ Park)"
  )

dist_scatter

Comparison: Residential Yard Area vs. Park Visit per Population Ratio

Hypothesis: People who have larger yards visit parks less.

The following scatter plot represents a linear regression between the average residential yard size and average park visits per 1000 people for each block group in San Jose. There is a significant positive correlation between these two factors (p-value < 0.05), representing that in general, people who live in communities that have a larger yard area per capita tend to visit parks more than communities with smaller yard areas per capita. This correlation seems somewhat counterintuitive; we hypothesized that if one is lacking yard space, then that person would be more likely to seek open space in parks. In our next iteration of this report, we plan to account for confounding variables, such as median household income, race/ethnicity, and family make-up of San Jose CBGs. See the linear model summary after the plot for the statistical results.

yard_visits_tied <-
  visit_pop_combo %>%
  left_join(
    sj_yard_pop_weight %>%
      st_set_geometry(NULL) %>%
      dplyr::select(yard_per_capita, origin_census_block_group),
    by = "origin_census_block_group"
  ) %>%
  filter(yard_per_capita != 0 & yard_per_capita < 2600)

lim_mod <- lm(yard_visits_tied$visits_per_pop ~ yard_visits_tied$yard_per_capita)

lim_scatter <-
  yard_visits_tied %>% 
  ggplot(
    aes(
      x = yard_per_capita,
      y = visits_per_pop
    )
  ) +
  geom_point() +
  geom_smooth(method=lm) +
  labs(
    x = "CBG Yard Area per Capita (sqft.)",
    y = "CBG Average Daily Park Visits per 1000",
    title = "SMC Park Accessibility: Parks Visits per 1000 vs. Residential Yard Area per Population"
  )

lim_scatter

summary(lim_mod)
## 
## Call:
## lm(formula = yard_visits_tied$visits_per_pop ~ yard_visits_tied$yard_per_capita)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.546 -20.489  -9.299  14.202 202.951 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      56.903133   2.446871  23.255  < 2e-16 ***
## yard_visits_tied$yard_per_capita  0.008484   0.003157   2.688  0.00754 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.48 on 348 degrees of freedom
## Multiple R-squared:  0.02033,    Adjusted R-squared:  0.01752 
## F-statistic: 7.223 on 1 and 348 DF,  p-value: 0.007543

Note: In the above scatter plot, a 2,600 sqft yard area per capita threshold is implemented in order to exclude outliers. The following histogram depicts all yard area per capita values. The main curve ends at 2,300 sqft, and the outlier clusters begin around 2,900 sqft. As mentioned earlier, these large values may be attributed to zoning misclassifications, so we do not include them in the linear model.

yard_hist <- plot_ly(
  x = sj_yard_pop_weight$yard_per_capita, 
  type = "histogram") %>%
  layout(
    title = "San Jose CBG Yard Area per Capita Distribution",
    shapes=list(type='line', x0= 2600, x1= 2600, y0=0, y1=316, line=list(dash='dot', width=1)),
         xaxis = list(
           title = "Yard Area per Capita (sqft.)"
         ),
         yaxis = list(
           title = "Number of CBG Occurances"
         ),
         annotations = list(
           list(
             xref = "x",
             yref = "y",
             x = 2600,
             y = 316,
             text = "Threshold",
             xanchor = "center",
             yanchor = "bottom",
             showarrow = F
           )
         )
  )

yard_hist

Comparison: Residential Yard Area vs. Minimum Distance to a Park

Hypothesis: People who live closer to parks have smaller yard areas.

The following scatter plot shows a significant positive correlation (p-value < 0.05) between residential yard size and distance to the nearest park. As mentioned above, we plan to account for confounding demographic variables in the next iteration of this report.

yard_dist_tied <-
  min_dist_park_osrm %>%
  left_join(
    sj_yard_pop_weight %>%
      st_set_geometry(NULL) %>%
      dplyr::select(yard_per_capita, origin_census_block_group),
    by = "origin_census_block_group"
  ) %>% 
  filter(yard_per_capita != 0 & yard_per_capita < 2600)

lim_mod2 <- lm(yard_dist_tied$distance ~ yard_dist_tied$yard_per_capita)

lim_scatter2 <-
  yard_dist_tied %>% 
  ggplot(
    aes(
      x = yard_per_capita,
      y = distance
    )
  ) +
  geom_point() +
  geom_smooth(method=lm) +
  labs(
    x = "CBG Yard Area per Capita (sqft.)",
    y = "Nearest Park Distance (mi.)",
    title = "SMC Park Accessibility: Nearest Park Distance vs. Residential Yard Area per Population"
  )

lim_scatter2

summary(lim_mod2)
## 
## Call:
## lm(formula = yard_dist_tied$distance ~ yard_dist_tied$yard_per_capita)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8658 -0.2914 -0.0547  0.1945  3.8519 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    5.339e-01  3.644e-02  14.654  < 2e-16 ***
## yard_dist_tied$yard_per_capita 2.754e-04  4.701e-05   5.859 1.08e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4837 on 348 degrees of freedom
## Multiple R-squared:  0.08978,    Adjusted R-squared:  0.08716 
## F-statistic: 34.33 on 1 and 348 DF,  p-value: 1.081e-08

Comparison: Residential Yard Area vs. Park Visits per 1000 People

Hypothesis: Those who live closer to parks visit parks more often.

The following scatter plot shows a weak negative correlation (p-value = 0.14) between residential yard size and parks visits per 1000 people, which indicates that those who live in communities closer to parks visit parks frequently.

pop_dist_tied <-
  min_dist_park_osrm %>%
  left_join(
    visit_pop_combo %>%
      st_set_geometry(NULL) %>%
      dplyr::select(visits_per_pop, origin_census_block_group),
    by = "origin_census_block_group"
  ) 

lim_mod3 <- lm(pop_dist_tied$visits_per_pop ~ pop_dist_tied$distance)

lim_scatter3 <-
  pop_dist_tied %>% 
  ggplot(
    aes(
      x = distance,
      y = visits_per_pop
    )
  ) +
  geom_point() +
  geom_smooth(method=lm) +
  labs(
    x = "Nearest Park Distance (mi.)",
    y = "CBG Average Daily Park Visits per 1000",
    title = "SMC Park Accessibility: Nearest Park Distance vs. Parks Visits per 1000"
  )

lim_scatter3

summary(lim_mod3)
## 
## Call:
## lm(formula = pop_dist_tied$visits_per_pop ~ pop_dist_tied$distance)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55.07 -23.48  -9.03  11.06 538.51 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              68.812      3.222  21.354   <2e-16 ***
## pop_dist_tied$distance   -5.763      3.917  -1.471    0.142    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.92 on 567 degrees of freedom
## Multiple R-squared:  0.003803,   Adjusted R-squared:  0.002046 
## F-statistic: 2.164 on 1 and 567 DF,  p-value: 0.1418